This is a tutorial script to producing an interactive visualization of words in a publication list extracted from Google Scholar. The appearance of the words is dependent on the frequency of their occurrence, and on which words they co-occur with in the titles of the listed articles.
The code is for R and it is, among others, based on packages ggplot2, plotly and htmlWidgets.
This document is part of the Github repository ScholarWordGraph by Tommi Suvitaival.
View this document at https://tommi-s.com/ScholarWordGraph/ to show all output correctly.
data.loaded <-
readr::read_delim(
file = "data/publications.txt",
delim = "\t",
escape_double = FALSE,
col_names = FALSE,
trim_ws = TRUE
)
## Rows: 96 Columns: 1
## -- Column specification --------------------------------------------------------
## Delimiter: "\t"
## chr (1): X1
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
data <- unlist( data.loaded )
## Warning: One or more parsing issues, see `problems()` for details
data <-
matrix(
data = unlist( data ),
ncol = 3,
byrow = TRUE
)
colnames( data ) <-
c(
"Title",
"Authors",
"Journal"
)
data <-
data.frame(
data,
stringsAsFactors = FALSE
)
data$"Title.lower" <-
tolower( data$"Title" )
data$"Title.lower" <-
stringr::str_replace_all(
string = data$"Title.lower",
# pattern = "type 1 diabete",
pattern = "type 1 diabetes",
replacement = "type_1_diabetes"
)
data$"Title.lower" <-
stringr::str_replace_all(
string = data$"Title.lower",
pattern = "type 2 diabetes",
replacement = "type_2_diabetes"
)
words <-
stringr::str_split(
string = data[ , "Title.lower" ],
pattern = "\\s"
)
words <-
lapply(
X = words,
FUN = tolower
)
words <-
lapply(
X = words,
FUN = stringr::str_replace_all,
pattern = "(\\()|(\\))|(\\:)|(\\,)|(\\.)",
replacement = ""
)
table.words <-
sort(
x = table( unlist( words ) ),
decreasing = TRUE
)
head( table.words )
##
## in of and the with for
## 25 21 17 13 11 8
words.unique <-
sort(
unique(
unlist( words ) )
)
blocklist <-
c(
"and",
"an",
"after",
"are",
"as",
"at",
"based",
"by",
"do",
"during",
"for",
"from",
"in",
"is",
"of",
"on",
"not",
"the",
"to",
"through",
"with",
"without",
"...",
letters
)
blocklist <-
c(
blocklist,
words.unique[ grepl( x = words.unique, pattern = "^[0-9]+$" ) ]
)
words <-
lapply(
X = words,
FUN = function( x ) {
x[ !( x %in% blocklist ) ]
}
)
table.words <-
sort(
x = table( unlist( words ) ),
decreasing = TRUE
)
head( table.words )
##
## type_1_diabetes analysis associated plasma data
## 6 5 5 5 4
## lipidomics
## 4
words.unique <-
sort(
unique(
unlist( words ) )
)
wba <-
array(
data = 0,
dim = c( length( words.unique ), nrow( data ) )
)
rownames( wba ) <- words.unique
colnames( wba ) <- data[ , "Title" ]
for ( i in 1:nrow( data ) ) {
wba[ words[[ i ]], i ] <- 1
}
wba.log1px <- log10( 1 + wba )
wba.norm <- scale( wba.log1px )
result.pca <-
princomp(
x = wba.norm
)
library( "ggfortify" )
## Warning: package 'ggfortify' was built under R version 4.0.5
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.0.5
plot <-
autoplot(
result.pca
# ,
# loadings = TRUE,
# loadings.label = TRUE
)
print( plot )
result.clustering <-
cluster::pam(
x = result.pca$scores[ , 1:2 ], # based on PCA scores
# x = wba.norm, # alternatively, based on original high-dimensional data
k = 9
)
data.plot <-
data.frame(
Word = rownames( result.pca$"scores" ),
result.pca$"scores"
)
data.plot$"Cluster" <-
factor(
x = result.clustering$"clustering",
levels = 1:length( unique( result.clustering$"clustering" ) ),
labels =
c(
"Cluster (C) 1",
paste( "C", 2:length( unique( result.clustering$"clustering" ) ) )
)
)
data.plot$"Distance" <-
sqrt( data.plot$"Comp.1"^2 + data.plot$"Comp.2"^2 ) + 1
data.plot$"Term" <-
stringr::str_replace_all(
string = data.plot$"Word",
pattern = "\\-",
replacement = "-\n"
)
data.plot$"Term" <-
stringr::str_replace_all(
string = data.plot$"Word",
pattern = "\\_",
replacement = "\n"
)
data.plot$"Publications" <-
apply(
X = wba[ data.plot$"Word", ] == 1,
MAR = 1,
FUN = function( x ) {
paste(
"\n",
names(
which( x )
),
collapse = "\n"
)
}
)
data.plot <-
data.plot[
order( data.plot$"Distance", decreasing = FALSE ),
]
jitter <- 1
plot <-
ggplot2::ggplot(
data = data.plot,
mapping =
ggplot2::aes(
x = Comp.2,
y = Comp.1,
label = Term,
size = Distance,
text = Publications
,
color = Cluster,
)
) +
ggplot2::geom_text(
position =
ggplot2::position_jitter(
height = jitter,
width = jitter
)
) +
ggplot2::scale_x_continuous( trans = "pseudo_log" ) +
ggplot2::scale_y_continuous( trans = "pseudo_log" ) +
ggplot2::theme(
axis.text = ggplot2::element_blank(),
axis.ticks = ggplot2::element_blank()
) +
ggplot2::xlab( label = "\n\n\nMap of my research topics" ) +
ggplot2::ylab( label = "" ) +
ggplot2::scale_color_brewer( palette = "Set1" )
plot
library( tidyr )
plot.interactive <-
plotly::ggplotly(
p = plot,
tooltip = c( "label", "Publications" )
) %>%
plotly::layout(
legend =
list(
orientation = "h",
title = list( text = "Cluster<br>" ),
x = quantile( x = data.plot$"Comp.2", probs = 0.67 ),
y = min( data.plot$"Comp.1" )
)
)
plot.interactive
(Uncomment for saving the files.)
widget <- plotly::partial_bundle( plot.interactive )
# dir.create( "output" )
#
# htmlwidgets::saveWidget( widget, "output/index.html" )
<iframe
id="igraph"
scrolling="no"
style="border:none;"
seamless="seamless"
src="output/index.html"
height="800"
width="800"
>
widget
</iframe>
plot.dark <-
plot +
ggplot2::theme(
axis.text = ggplot2::element_blank(),
axis.ticks = ggplot2::element_blank(),
axis.title = ggplot2::element_text( color = "gray" ),
legend.background = ggplot2::element_rect( color = NA, fill = "black" ),
legend.key = ggplot2::element_rect( color = "gray", fill = "gray" ),
legend.text = ggplot2::element_text( color = "gray" ),
legend.title = ggplot2::element_text( color = "gray" ),
panel.background = ggplot2::element_rect( fill = "black", color = NA ),
panel.grid = ggplot2::element_blank(),
plot.background = ggplot2::element_rect( color = "black", fill = "black" )
)
plot.dark
library( tidyr )
plot.dark.interactive <-
plotly::ggplotly(
p = plot.dark,
tooltip = c( "label", "Publications" )
) %>%
plotly::layout(
legend =
list(
orientation = "h",
title = list( text = "Cluster<br>" ),
x = quantile( x = data.plot$"Comp.2", probs = 0.67 ),
y = min( data.plot$"Comp.1" )
)
)
plot.dark.interactive
(Uncomment for saving the files.)
widget <- plotly::partial_bundle( plot.dark.interactive )
# dir.create( "output-dark" )
#
# htmlwidgets::saveWidget( widget, "output-dark/index.html" )
<iframe
id="igraph"
scrolling="no"
style="border:none;"
seamless="seamless"
src="output-dark/index.html"
height="800"
width="800"
>
widget
</iframe>
utils::sessionInfo()
## R version 4.0.4 (2021-02-15)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19042)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_United States.1252
## [2] LC_CTYPE=English_United States.1252
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.1252
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] tidyr_1.1.2 ggfortify_0.4.12 ggplot2_3.3.5
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.1.1 xfun_0.27 purrr_0.3.4 colorspace_2.0-0
## [5] vctrs_0.3.8 generics_0.1.0 htmltools_0.5.1.1 viridisLite_0.4.0
## [9] yaml_2.2.1 utf8_1.1.4 plotly_4.9.4.1 rlang_0.4.11
## [13] jquerylib_0.1.4 pillar_1.6.2 glue_1.4.2 withr_2.4.2
## [17] bit64_4.0.5 RColorBrewer_1.1-2 lifecycle_1.0.0 stringr_1.4.0
## [21] munsell_0.5.0 gtable_0.3.0 htmlwidgets_1.5.4 evaluate_0.14
## [25] labeling_0.4.2 knitr_1.34 tzdb_0.1.2 crosstalk_1.1.1
## [29] curl_4.3 parallel_4.0.4 fansi_0.4.2 highr_0.9
## [33] readr_2.0.1 scales_1.1.1 vroom_1.5.5 jsonlite_1.7.2
## [37] farver_2.1.0 bit_4.0.4 gridExtra_2.3 hms_1.1.0
## [41] digest_0.6.27 stringi_1.5.3 dplyr_1.0.4 grid_4.0.4
## [45] cli_3.0.1 tools_4.0.4 magrittr_2.0.1 lazyeval_0.2.2
## [49] tibble_3.0.6 cluster_2.1.2 crayon_1.4.1 pkgconfig_2.0.3
## [53] ellipsis_0.3.2 data.table_1.13.6 rmarkdown_2.11 httr_1.4.2
## [57] rstudioapi_0.13 R6_2.5.1 compiler_4.0.4
if ( file.exists( "README.html" ) ) {
system( command = "rm index.html" )
system( command = "cp README.html index.html" )
}
## [1] 0